import pandas as pd
import numpy as np
import pickle
from string import punctuation
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style="whitegrid")
plt.rc('axes', titlesize=14)
plt.rc('legend', fontsize=14)
plt.rc('xtick', labelsize=12)
plt.rc('ytick', labelsize=12)
plt.rcParams.update({'font.size': 16})
plt.rcParams['axes.titlesize'] = 16
plt.rcParams["figure.figsize"] = (10, 6)
plt.rcParams.update({'lines.markeredgewidth': 1})
plt.rcParams.update({'errorbar.capsize': 2})
import plotly
import plotly.express as px
file_names = {
    "df_es_mapping": "../../Data/mapping/df_es_mapping.pickle",
    "df_us_mapping": "../../Data/mapping/df_us_mapping.pickle",
    "df_es_test": "../../Data/test/df_es_test.pickle",
    "df_us_test": "../../Data/test/df_us_test.pickle",
    "df_es_train": "../../Data/train/df_es_train.pickle",
    "df_us_train": "../../Data/train/df_us_train.pickle",
    "df_es_trial": "../../Data/trial/df_es_trial.pickle",
    "df_us_trial": "../../Data/trial/df_us_trial.pickle",
}
# more imports
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import TweetTokenizer  # tokenizer tailored to tweets
tt = TweetTokenizer()
# nltk.download('stopwords')
import plotly.graph_objects as go
Load datasets
df_es_train = pickle.load(open(file_names["df_es_train"], "rb"))
df_es_trial = pickle.load(open(file_names["df_es_trial"], "rb"))
df_es_test = pickle.load(open(file_names["df_es_test"], "rb"))
df_us_train = pickle.load(open(file_names["df_us_train"], "rb"))
df_us_trial = pickle.load(open(file_names["df_us_trial"], "rb"))
df_us_test = pickle.load(open(file_names["df_us_test"], "rb"))
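A quick shape check right after loading can catch path or schema problems early; a minimal sketch, assuming each pickled DataFrame has at least "text" and "label" columns, as used below.
# Sanity check: shape and columns of each loaded US split
for name, df in [("us_train", df_us_train), ("us_trial", df_us_trial), ("us_test", df_us_test)]:
    print(name, df.shape, list(df.columns))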
Pre-processing
# Lowercase each tweet and re-join the TweetTokenizer tokens with spaces so the result can be fed to CountVectorizer as a plain string
df_us_train['tokenized_text'] = df_us_train['text'].str.lower().apply(lambda x: " ".join(tt.tokenize(x)))
df_us_train.head()
df_us_test['tokenized_text'] = df_us_test['text'].str.lower().apply(lambda x: " ".join(tt.tokenize(x)))
"""stopwords_en_withpunct = set(stopwords_en).union(set(punctuation))
print(list(stopwords_en_withpunct)[:10])""";
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(min_df=5)
X_train_bow = vectorizer.fit_transform(df_us_train["tokenized_text"])
X_test_bow = vectorizer.transform(df_us_test["tokenized_text"])
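To see what min_df=5 keeps, a quick look at the learned vocabulary (sketch):
# Inspect the fitted vocabulary: size and a few sample tokens
print("vocabulary size:", len(vectorizer.get_feature_names_out()))
print(vectorizer.get_feature_names_out()[:10])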
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train_bow, df_us_train["label"])
clf.score(X_train_bow, df_us_train["label"])
from sklearn.metrics import classification_report
df_us_mapping = pickle.load(open(file_names["df_us_mapping"], "rb")).sort_values("label")
df_us_mapping
y_pred = clf.predict(X_test_bow)
print(classification_report(df_us_test["label"], y_pred, target_names=df_us_mapping["emoji"]))
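A confusion matrix complements the per-class report; a minimal sketch with seaborn (already imported above), assuming the rows of df_us_mapping are aligned with the label order used by classification_report:
from sklearn.metrics import confusion_matrix
# Rows = true emoji, columns = predicted emoji
cm = confusion_matrix(df_us_test["label"], y_pred)
sns.heatmap(cm, xticklabels=df_us_mapping["emoji"], yticklabels=df_us_mapping["emoji"], cmap="Blues")
plt.title("Confusion matrix (US test set)")
plt.show()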
# Build a token -> column-index lookup and test the classifier on a single one-hot vector ("santa")
vocab = {token: idx for idx, token in enumerate(vectorizer.get_feature_names_out())}
vec_test = np.zeros(X_train_bow.shape[1])
k = vocab["santa"]
vec_test[k] = 1
print(vectorizer.inverse_transform([vec_test])[0][0])
clf.predict_proba([vec_test])
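The same probability vector can be mapped back to an emoji; a sketch (the str() comparison hedges against the label column and clf.classes_ having different dtypes):
# Find the most probable class for the one-hot "santa" vector and look up its emoji
probs = clf.predict_proba([vec_test])[0]
best_label = clf.classes_[np.argmax(probs)]
print(best_label, df_us_mapping.loc[df_us_mapping["label"].astype(str) == str(best_label), "emoji"].values)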
%%time
# Query the classifier with a one-hot vector for every token in the vocabulary
vocab_length = X_train_bow.shape[1]
proba_matrix = np.array([clf.predict_proba(np.eye(1, vocab_length, k))[0] for k in range(vocab_length)])
print(vocab_length)
print(proba_matrix.shape)
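The loop above is easy to read but slow; the same per-token probabilities can be computed in one shot from the fitted model's parameters (MultinomialNB exposes class_log_prior_ and feature_log_prob_). A sketch:
# Vectorized equivalent of proba_matrix: P(class | single token) via Bayes' rule, one row per vocabulary token
log_joint = clf.feature_log_prob_.T + clf.class_log_prior_      # shape (vocab_length, n_classes)
proba_matrix_fast = np.exp(log_joint - log_joint.max(axis=1, keepdims=True))
proba_matrix_fast /= proba_matrix_fast.sum(axis=1, keepdims=True)
print(np.allclose(proba_matrix, proba_matrix_fast))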
una_linea = proba_matrix[:,3]
una_linea.shape
def topPalabras(proba_matrix, emoji_id, k=5):
    # return the k words for which the given emoji has the highest probability
    prob = proba_matrix[:, emoji_id]
    ind = np.argpartition(prob, -k)[-k:]
    val = prob[ind]
    palabras = [vectorizer.inverse_transform([np.eye(1, vocab_length, j)[0]])[0][0] for j in ind]
    return palabras, val
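A lighter variant can index the feature-name array directly instead of round-tripping through inverse_transform; topPalabrasFast below is a hypothetical helper, not part of the original notebook:
feature_names = vectorizer.get_feature_names_out()
def topPalabrasFast(proba_matrix, emoji_id, k=5):
    # same output as topPalabras, using direct indexing into the vocabulary array
    ind = np.argpartition(proba_matrix[:, emoji_id], -k)[-k:]
    return list(feature_names[ind]), proba_matrix[ind, emoji_id]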
i = 9
# map_emojis[i]: numeric label of the i-th class when the string labels are sorted lexicographically ("0", "1", "10", "11", ...)
map_emojis = [0, 1, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 2, 3, 4, 5, 6, 7, 8, 9]
print(df_us_mapping["emoji"][map_emojis[i]])
topPalabras(proba_matrix, i)
for i in range(20):
    print(df_us_mapping["emoji"][map_emojis[i]])
    pal, val = topPalabras(proba_matrix, i)
    print(dict(zip(pal, val)))
!pip install umap-learn
import umap
reducer = umap.UMAP()
to_R2 = reducer.fit_transform(proba_matrix)
to_R2.shape
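UMAP embeddings are stochastic; if a reproducible layout is needed, the reducer can be seeded before fitting (optional sketch, left commented out):
# reducer = umap.UMAP(random_state=42)
# to_R2 = reducer.fit_transform(proba_matrix)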
df_umap = pd.DataFrame(to_R2)
df_umap["token"] = vectorizer.get_feature_names_out()
df_umap["label"] = np.argmax(proba_matrix, axis=1).astype(str)
df_umap["proba"] = np.max(proba_matrix, axis=1)
df_umap = df_umap.merge(df_us_mapping, on="label", how="left")
df_umap
data = []
for label in df_us_mapping["label"]:
    sub_df = df_umap[df_umap["label"] == label]
    data.append(
        go.Scattergl(
            x=sub_df[0],
            y=sub_df[1],
            mode="markers",
            text=sub_df["token"] + "<br>" + sub_df["emoji"] + "<br>" + sub_df["proba"].apply(lambda x: str(np.round(x, 3))),
            name=sub_df["emoji"].iloc[0],
            marker=dict(
                size=25 * sub_df["proba"],
                line_width=0.2,
            ),
        )
    )
fig = go.Figure(data=data)
fig.update_layout(
    title="UMAP projection of token probability vectors",
    autosize=False,
    width=700,
    height=500,
)
fig.show(renderer="notebook")
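If the notebook renderer is not available, the same interactive figure can be written to a standalone HTML file (the filename here is illustrative):
fig.write_html("umap_token_probabilities.html")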